{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import print_function, division\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "import itertools, os, time\n",
      "\n",
      "print(\"Pandas version = \", pd.__version__)\n",
      "print(\"Numpy version =  \", np.__version__)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Pandas version =  0.13.1\n",
        "Numpy version =   1.8.0\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Create some toy data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "N_PERIODS = 1E6\n",
      "N_METERS = 5\n",
      "N_MEASUREMENTS_PER_METER = 1\n",
      "\n",
      "meters = ['meter{:d}'.format(i) for i in range(1,N_METERS+1)]\n",
      "meters = [[m]*N_MEASUREMENTS_PER_METER for m in meters]\n",
      "flatten_2d_list = lambda lst: list(itertools.chain(*lst))\n",
      "meters = flatten_2d_list(meters)\n",
      "level2 = ['power', 'power', 'voltage'][:N_MEASUREMENTS_PER_METER] * N_METERS\n",
      "level3 = ['active', 'reactive', ''][:N_MEASUREMENTS_PER_METER] * N_METERS\n",
      "\n",
      "\n",
      "columns = [meters, level2, level3]\n",
      "columns = pd.MultiIndex.from_arrays(columns)\n",
      "rng = pd.date_range('2012', freq='S', periods=N_PERIODS)\n",
      "data = np.random.randint(low=0, high=1000, \n",
      "                         size=(N_PERIODS, \n",
      "                               N_METERS*N_MEASUREMENTS_PER_METER))\n",
      "df = pd.DataFrame(data=data, index=rng, columns=columns, dtype=np.float32)\n",
      "\n",
      "# df.iloc[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Save the data to an HDF5 file, one big table"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "COMPRESSION = 'zlib'\n",
      "\n",
      "filename = 'one_big_table.h5'\n",
      "t0 = time.time()\n",
      "store = pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION)\n",
      "store.put('df', df, format='table')\n",
      "store.close()\n",
      "\n",
      "def print_runtime(start):\n",
      "    print('Time = {:.1f}'.format(time.time() - start))\n",
      "\n",
      "def print_filesize(filename):\n",
      "    print('Filesize of \\'{:s}\\' is {:.2f} MBytes'\n",
      "           .format(filename, os.path.getsize(filename) / 1E6))\n",
      "    \n",
      "print_runtime(t0)\n",
      "print_filesize(filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Time = 9.1\n",
        "Filesize of 'one_big_table.h5' is 10.47 MBytes\n"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Save the data to an HDF5 file, one table per meter"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "filename = 'one_table_per_meter.h5'\n",
      "t0 = time.time()\n",
      "store = pd.HDFStore(filename, 'w', complevel=9, complib=COMPRESSION)\n",
      "for meter in df.columns.levels[0]:\n",
      "    store.put(meter, df[meter], format='table')\n",
      "store.close()\n",
      "\n",
      "print_runtime(t0)\n",
      "print_filesize(filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Time = 14.0\n",
        "Filesize of 'one_table_per_meter.h5' is 16.89 MBytes\n"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### RESULTS\n",
      "\n",
      "Default config is 5 meters, 3 params per meter, 1 million rows, float32 for power data, fixed format\n",
      "\n",
      "#### BLOSC\n",
      "\n",
      "5 meters: 35.57 MBytes versus 39.26 MBytes = 1.10 x larger\n",
      "\n",
      "5 meters, 1 param per meter: 12.49 MB (0.3 sec) vs 16.16 MB (0.4 sec) = 1.29 x larger\n",
      "\n",
      "5 meters, 1 param per meter, table format: 16.48 MB (1.3 sec) vs 22.13 MB (4.6 sec) = 1.34 x larger\n",
      "\n",
      "50 meters: 346.93 MB versus 392.56 MB = 1.13 x larger\n",
      "\n",
      "#### ZLIB\n",
      "\n",
      "(48 seconds compress!)\n",
      "\n",
      "5 meters: 24.80 MB versus 26.77 MB = 1.07 x larger\n",
      "\n",
      "#### BZIP2\n",
      "\n",
      "5 meters, 3 params per meter: 23.73 MB (8.7 sec) versus 25.87 MB (12.1 sec) = 1.09 x larger\n",
      "\n",
      "5 meters, 1 param per meter: 8.28 MB (3.3 sec) vs 10.52MB (6.7 sec) = 1.27 x larger\n",
      "\n",
      "5 meters, 1 param per meter, table format: 10.38 MB (5.3 sec) vs 17.55 MB (13.7 sec) = 1.69 x larger (!)\n",
      "\n",
      "#### LZO\n",
      "\n",
      "5 meters: 30.17 MB (0.3 sec) versus 34.17 MB (0.5 sec) = 1.13 x larger\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}